// +build arm64,!noasm

#include "textflag.h"

TEXT ·cmovP751(SB), NOSPLIT, $0-17
	MOVD	x+0(FP), R0
	MOVD	y+8(FP), R1
	MOVB	choice+16(FP), R2

	// Set flags
	// If choice is not 0 or 1, this implementation will swap completely
	CMP	$0, R2

	LDP	0(R0), (R3, R4)
	LDP	0(R1), (R5, R6)
	CSEL	EQ, R3, R5, R7
	CSEL	EQ, R4, R6, R8
	STP	(R7, R8), 0(R0)

	LDP	16(R0), (R3, R4)
	LDP	16(R1), (R5, R6)
	CSEL	EQ, R3, R5, R7
	CSEL	EQ, R4, R6, R8
	STP	(R7, R8), 16(R0)

	LDP	32(R0), (R3, R4)
	LDP	32(R1), (R5, R6)
	CSEL	EQ, R3, R5, R7
	CSEL	EQ, R4, R6, R8
	STP	(R7, R8), 32(R0)

	LDP	48(R0), (R3, R4)
	LDP	48(R1), (R5, R6)
	CSEL	EQ, R3, R5, R7
	CSEL	EQ, R4, R6, R8
	STP	(R7, R8), 48(R0)

	LDP	64(R0), (R3, R4)
	LDP	64(R1), (R5, R6)
	CSEL	EQ, R3, R5, R7
	CSEL	EQ, R4, R6, R8
	STP	(R7, R8), 64(R0)

	LDP	80(R0), (R3, R4)
	LDP	80(R1), (R5, R6)
	CSEL	EQ, R3, R5, R7
	CSEL	EQ, R4, R6, R8
	STP	(R7, R8), 80(R0)

	RET

TEXT ·cswapP751(SB), NOSPLIT, $0-17
	MOVD	x+0(FP), R0
	MOVD	y+8(FP), R1
	MOVB	choice+16(FP), R2

	// Set flags
	// If choice is not 0 or 1, this implementation will swap completely
	CMP	$0, R2

	LDP	0(R0), (R3, R4)
	LDP	0(R1), (R5, R6)
	CSEL	EQ, R3, R5, R7
	CSEL	EQ, R4, R6, R8
	STP	(R7, R8), 0(R0)
	CSEL	NE, R3, R5, R9
	CSEL	NE, R4, R6, R10
	STP	(R9, R10), 0(R1)

	LDP	16(R0), (R3, R4)
	LDP	16(R1), (R5, R6)
	CSEL	EQ, R3, R5, R7
	CSEL	EQ, R4, R6, R8
	STP	(R7, R8), 16(R0)
	CSEL	NE, R3, R5, R9
	CSEL	NE, R4, R6, R10
	STP	(R9, R10), 16(R1)

	LDP	32(R0), (R3, R4)
	LDP	32(R1), (R5, R6)
	CSEL	EQ, R3, R5, R7
	CSEL	EQ, R4, R6, R8
	STP	(R7, R8), 32(R0)
	CSEL	NE, R3, R5, R9
	CSEL	NE, R4, R6, R10
	STP	(R9, R10), 32(R1)

	LDP	48(R0), (R3, R4)
	LDP	48(R1), (R5, R6)
	CSEL	EQ, R3, R5, R7
	CSEL	EQ, R4, R6, R8
	STP	(R7, R8), 48(R0)
	CSEL	NE, R3, R5, R9
	CSEL	NE, R4, R6, R10
	STP	(R9, R10), 48(R1)

	LDP	64(R0), (R3, R4)
	LDP	64(R1), (R5, R6)
	CSEL	EQ, R3, R5, R7
	CSEL	EQ, R4, R6, R8
	STP	(R7, R8), 64(R0)
	CSEL	NE, R3, R5, R9
	CSEL	NE, R4, R6, R10
	STP	(R9, R10), 64(R1)

	LDP	80(R0), (R3, R4)
	LDP	80(R1), (R5, R6)
	CSEL	EQ, R3, R5, R7
	CSEL	EQ, R4, R6, R8
	STP	(R7, R8), 80(R0)
	CSEL	NE, R3, R5, R9
	CSEL	NE, R4, R6, R10
	STP	(R9, R10), 80(R1)

	RET

TEXT ·addP751(SB), NOSPLIT, $0-24
	MOVD	z+0(FP), R2
	MOVD	x+8(FP), R0
	MOVD	y+16(FP), R1

	// Load first summand into R3-R14
	// Add first summand and second summand and store result in R3-R14
	LDP	0(R0), (R3, R4)
	LDP	0(R1), (R15, R16)
	LDP	16(R0), (R5, R6)
	LDP	16(R1), (R17, R19)
	ADDS	R15, R3
	ADCS	R16, R4
	ADCS	R17, R5
	ADCS	R19, R6

	LDP	32(R0), (R7, R8)
	LDP	32(R1), (R15, R16)
	LDP	48(R0), (R9, R10)
	LDP	48(R1), (R17, R19)
	ADCS	R15, R7
	ADCS	R16, R8
	ADCS	R17, R9
	ADCS	R19, R10

	LDP	64(R0), (R11, R12)
	LDP	64(R1), (R15, R16)
	LDP	80(R0), (R13, R14)
	LDP	80(R1), (R17, R19)
	ADCS	R15, R11
	ADCS	R16, R12
	ADCS	R17, R13
	ADC	R19, R14

	// Subtract 2 * p751 in R15-R24 from the result in R3-R14
	LDP	·P751x2+0(SB), (R15, R16)
	SUBS	R15, R3
	SBCS	R16, R4
	LDP	·P751x2+40(SB), (R17, R19)
	SBCS	R16, R5
	SBCS	R16, R6
	SBCS	R16, R7
	LDP	·P751x2+56(SB), (R20, R21)
	SBCS	R17, R8
	SBCS	R19, R9
	LDP	·P751x2+72(SB), (R22, R23)
	SBCS	R20, R10
	SBCS	R21, R11
	MOVD	·P751x2+88(SB), R24
	SBCS	R22, R12
	SBCS	R23, R13
	SBCS	R24, R14
	SBC	ZR, ZR, R25

	// If x + y - 2 * p751 < 0, R25 is 1 and 2 * p751 should be added
	AND	R25, R15
	AND	R25, R16
	AND 	R25, R17
	AND	R25, R19
	AND	R25, R20
	AND	R25, R21
	AND 	R25, R22
	AND	R25, R23
	AND	R25, R24

	ADDS	R15, R3
	ADCS	R16, R4
	STP	(R3, R4), 0(R2)
	ADCS	R16, R5
	ADCS	R16, R6
	STP	(R5, R6), 16(R2)
	ADCS	R16, R7
	ADCS	R17, R8
	STP	(R7, R8), 32(R2)
	ADCS	R19, R9
	ADCS	R20, R10
	STP	(R9, R10), 48(R2)
	ADCS	R21, R11
	ADCS	R22, R12
	STP	(R11, R12), 64(R2)
	ADCS	R23, R13
	ADC	R24, R14
	STP	(R13, R14), 80(R2)

	RET

TEXT ·subP751(SB), NOSPLIT, $0-24
	MOVD	z+0(FP), R2
	MOVD	x+8(FP), R0
	MOVD	y+16(FP), R1

	// Load x into R3-R14
	// Subtract y from x and store result in R3-R14
	LDP	0(R0), (R3, R4)
	LDP	0(R1), (R15, R16)
	LDP	16(R0), (R5, R6)
	LDP	16(R1), (R17, R19)
	SUBS	R15, R3
	SBCS	R16, R4
	SBCS	R17, R5
	SBCS	R19, R6

	LDP	32(R0), (R7, R8)
	LDP	32(R1), (R15, R16)
	LDP	48(R0), (R9, R10)
	LDP	48(R1), (R17, R19)
	SBCS	R15, R7
	SBCS	R16, R8
	SBCS	R17, R9
	SBCS	R19, R10

	LDP	64(R0), (R11, R12)
	LDP	64(R1), (R15, R16)
	LDP	80(R0), (R13, R14)
	LDP	80(R1), (R17, R19)
	SBCS	R15, R11
	SBCS	R16, R12
	SBCS	R17, R13
	SBCS	R19, R14
	SBC	ZR, ZR, R15

	// If x - y < 0, R15 is 1 and 2 * p751 should be added
	LDP	·P751x2+0(SB), (R16, R17)
	AND	R15, R16
	AND	R15, R17
	LDP	·P751x2+40(SB), (R19, R20)
	AND	R15, R19
	AND	R15, R20

	ADDS	R16, R3
	ADCS	R17, R4
	STP	(R3, R4), 0(R2)
	ADCS	R17, R5
	ADCS	R17, R6
	STP	(R5, R6), 16(R2)
	ADCS	R17, R7
	ADCS	R19, R8
	STP	(R7, R8), 32(R2)
	ADCS	R20, R9

	LDP	·P751x2+56(SB), (R16, R17)
	AND	R15, R16
	AND	R15, R17
	LDP	·P751x2+72(SB), (R19, R20)
	AND	R15, R19
	AND	R15, R20

	ADCS	R16, R10
	STP	(R9, R10), 48(R2)
	ADCS	R17, R11
	ADCS	R19, R12
	STP	(R11, R12), 64(R2)
	ADCS	R20, R13

	MOVD	·P751x2+88(SB), R16
	AND	R15, R16
	ADC	R16, R14
	STP	(R13, R14), 80(R2)

	RET

TEXT ·adlP751(SB), NOSPLIT, $0-24
	MOVD	z+0(FP), R2
	MOVD	x+8(FP), R0
	MOVD	y+16(FP), R1

	LDP	0(R0), (R3, R4)
	LDP	0(R1), (R15, R16)
	LDP	16(R0), (R5, R6)
	LDP	16(R1), (R17, R19)
	ADDS	R15, R3
	ADCS	R16, R4
	STP	(R3, R4), 0(R2)
	ADCS	R17, R5
	ADCS	R19, R6
	STP	(R5, R6), 16(R2)

	LDP	32(R0), (R7, R8)
	LDP	32(R1), (R15, R16)
	LDP	48(R0), (R9, R10)
	LDP	48(R1), (R17, R19)
	ADCS	R15, R7
	ADCS	R16, R8
	STP	(R7, R8), 32(R2)
	ADCS	R17, R9
	ADCS	R19, R10
	STP	(R9, R10), 48(R2)

	LDP	64(R0), (R11, R12)
	LDP	64(R1), (R15, R16)
	LDP	80(R0), (R13, R14)
	LDP	80(R1), (R17, R19)
	ADCS	R15, R11
	ADCS	R16, R12
	STP	(R11, R12), 64(R2)
	ADCS	R17, R13
	ADCS	R19, R14
	STP	(R13, R14), 80(R2)

	LDP	96(R0), (R3, R4)
	LDP	96(R1), (R15, R16)
	LDP	112(R0), (R5, R6)
	LDP	112(R1), (R17, R19)
	ADCS	R15, R3
	ADCS	R16, R4
	STP	(R3, R4), 96(R2)
	ADCS	R17, R5
	ADCS	R19, R6
	STP	(R5, R6), 112(R2)

	LDP	128(R0), (R7, R8)
	LDP	128(R1), (R15, R16)
	LDP	144(R0), (R9, R10)
	LDP	144(R1), (R17, R19)
	ADCS	R15, R7
	ADCS	R16, R8
	STP	(R7, R8), 128(R2)
	ADCS	R17, R9
	ADCS	R19, R10
	STP	(R9, R10), 144(R2)

	LDP	160(R0), (R11, R12)
	LDP	160(R1), (R15, R16)
	LDP	176(R0), (R13, R14)
	LDP	176(R1), (R17, R19)
	ADCS	R15, R11
	ADCS	R16, R12
	STP	(R11, R12), 160(R2)
	ADCS	R17, R13
	ADC	R19, R14
	STP	(R13, R14), 176(R2)

	RET

TEXT ·sulP751(SB), NOSPLIT, $0-24
	MOVD	z+0(FP), R2
	MOVD	x+8(FP), R0
	MOVD	y+16(FP), R1

	LDP	0(R0), (R3, R4)
	LDP	0(R1), (R15, R16)
	LDP	16(R0), (R5, R6)
	LDP	16(R1), (R17, R19)
	SUBS	R15, R3
	SBCS	R16, R4
	STP	(R3, R4), 0(R2)
	SBCS	R17, R5
	SBCS	R19, R6
	STP	(R5, R6), 16(R2)

	LDP	32(R0), (R7, R8)
	LDP	32(R1), (R15, R16)
	LDP	48(R0), (R9, R10)
	LDP	48(R1), (R17, R19)
	SBCS	R15, R7
	SBCS	R16, R8
	STP	(R7, R8), 32(R2)
	SBCS	R17, R9
	SBCS	R19, R10
	STP	(R9, R10), 48(R2)

	LDP	64(R0), (R11, R12)
	LDP	64(R1), (R15, R16)
	LDP	80(R0), (R13, R14)
	LDP	80(R1), (R17, R19)
	SBCS	R15, R11
	SBCS	R16, R12
	STP	(R11, R12), 64(R2)
	SBCS	R17, R13
	SBCS	R19, R14
	STP	(R13, R14), 80(R2)

	LDP	96(R0), (R3, R4)
	LDP	96(R1), (R15, R16)
	LDP	112(R0), (R5, R6)
	LDP	112(R1), (R17, R19)
	SBCS	R15, R3
	SBCS	R16, R4
	SBCS	R17, R5
	SBCS	R19, R6

	LDP	128(R0), (R7, R8)
	LDP	128(R1), (R15, R16)
	LDP	144(R0), (R9, R10)
	LDP	144(R1), (R17, R19)
	SBCS	R15, R7
	SBCS	R16, R8
	SBCS	R17, R9
	SBCS	R19, R10

	LDP	160(R0), (R11, R12)
	LDP	160(R1), (R15, R16)
	LDP	176(R0), (R13, R14)
	LDP	176(R1), (R17, R19)
	SBCS	R15, R11
	SBCS	R16, R12
	SBCS	R17, R13
	SBCS	R19, R14
	SBC	ZR, ZR, R15

	// If x - y < 0, R15 is 1 and p751 should be added
	MOVD	·P751+0(SB), R20
	AND	R15, R20
	LDP	·P751+40(SB), (R16, R17)
	ADDS	R20, R3
	ADCS	R20, R4
	STP	(R3, R4), 96(R2)
	ADCS	R20, R5
	ADCS	R20, R6
	STP	(R5, R6), 112(R2)
	ADCS	R20, R7

	LDP	·P751+56(SB), (R19, R20)
	AND	R15, R16
	AND 	R15, R17
	ADCS	R16, R8
	STP	(R7, R8), 128(R2)
	ADCS	R17, R9

	LDP	·P751+72(SB), (R16, R17)
	AND	R15, R19
	AND	R15, R20
	ADCS	R19, R10
	STP	(R9, R10), 144(R2)
	ADCS	R20, R11

	MOVD	·P751+88(SB), R19
	AND 	R15, R16
	AND 	R15, R17
	ADCS	R16, R12
	STP	(R11, R12), 160(R2)
	ADCS	R17, R13

	AND	R15, R19
	ADC	R19, R14
	STP	(R13, R14), 176(R2)

	RET

// Expects that X0*Y0 is already in Z0(low),Z3(high) and X0*Y1 in Z1(low),Z2(high)
// Z0 is not actually touched
// Result of (X0-X2) * (Y0-Y2) will be in Z0-Z5
// Inputs remain intact
#define mul192x192comba(X0, X1, X2, Y0, Y1, Y2, Z0, Z1, Z2, Z3, Z4, Z5, T0, T1, T2, T3) \
	MUL	X1, Y0, T2	\
	UMULH	X1, Y0, T3	\
				\
	ADDS	Z3, Z1		\
	ADCS	ZR, Z2		\
	ADC	ZR, ZR, Z3	\
				\
	MUL	X0, Y2, T0	\
	UMULH	X0, Y2, T1	\
				\
	ADDS	T2, Z1		\
	ADCS	T3, Z2		\
	ADC	ZR, Z3		\
				\
	MUL	X1, Y1, T2	\
	UMULH	X1, Y1, T3	\
				\
	ADDS	T0, Z2		\
	ADCS	T1, Z3		\
	ADC	ZR, ZR, Z4	\
				\
	MUL	X2, Y0, T0	\
	UMULH	X2, Y0, T1	\
				\
	ADDS	T2, Z2		\
	ADCS	T3, Z3		\
	ADC	ZR, Z4		\
				\
	MUL	X1, Y2, T2	\
	UMULH	X1, Y2, T3	\
				\
	ADDS	T0, Z2		\
	ADCS	T1, Z3		\
	ADC	ZR, Z4		\
				\
	MUL	X2, Y1, T0	\
	UMULH	X2, Y1, T1	\
				\
	ADDS	T2, Z3		\
	ADCS	T3, Z4		\
	ADC	ZR, ZR, Z5	\
				\
	MUL	X2, Y2, T2	\
	UMULH	X2, Y2, T3	\
				\
	ADDS	T0, Z3		\
	ADCS	T1, Z4		\
	ADC	ZR, Z5		\
				\
	ADDS	T2, Z4		\
	ADC	T3, Z5

// Expects that X points to (X4-X6), Y to (Y4-Y6)
// Result of (X0-X5) * (Y0-Y5) will be in (0(Z), 8(Z), 16(Z), T0-T8)
// Inputs get overwritten
#define mul384x384karatsuba(X, Y, Z, X0, X1, X2, X3, X4, X5, Y0, Y1, Y2, Y3, Y4, Y5, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, T10)\
	ADDS	X0, X3		\	// xH + xL, destroys xH
	ADCS	X1, X4		\
	ADCS	X2, X5		\
	ADC	ZR, ZR, T10	\
				\
	ADDS	Y0, Y3		\	// yH + yL, destroys yH
	ADCS	Y1, Y4		\
	ADCS	Y2, Y5		\
	ADC	ZR, ZR, T6	\
				\
	SUB	T10, ZR, T7	\
	SUB	T6, ZR, T8	\
	AND	T6, T10		\	// combined carry
				\
	AND	T7, Y3, T0	\	// masked(yH + yL)
	AND	T7, Y4, T1	\
	AND	T7, Y5, T2	\
				\
	AND	T8, X3, T3	\	// masked(xH + xL)
	AND	T8, X4, T4	\
	AND	T8, X5, T5	\
				\
	ADDS	T3, T0		\
	ADCS	T4, T1		\
	STP	(T0, T1), 0+Z	\
				\
	MUL	X3, Y3, T0	\
	MUL	X3, Y4, T1	\
				\
	ADCS	T5, T2		\
	MOVD	T2, 16+Z	\
				\
	UMULH	X3, Y4, T2	\
	UMULH	X3, Y3, T3	\
				\
	ADC	ZR, T10		\
				\	// (xH + xL) * (yH + yL)
	mul192x192comba(X3, X4, X5, Y3, Y4, Y5, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9)\
				\
	MUL	X0, Y0, X3	\
	LDP	0+Z, (T6, T7)	\
	MOVD	16+Z, T8	\
				\
	UMULH	X0, Y0, Y3	\
	ADDS	T6, T3		\
	ADCS	T7, T4		\
	MUL	X0, Y1, X4	\
	ADCS	T8, T5		\
	ADC	ZR, T10		\
	UMULH	X0, Y1, X5	\
				\	// xL * yL
	mul192x192comba(X0, X1, X2, Y0, Y1, Y2, X3, X4, X5, Y3, Y4, Y5, T6, T7, T8, T9)\
				\
	STP	(X3, X4), 0+Z	\
	MOVD	X5, 16+Z	\
				\
	SUBS	X3, T0		\	// (xH + xL) * (yH + yL) - xL * yL
	SBCS	X4, T1		\
	LDP	0+X, (X3, X4)	\
	SBCS	X5, T2		\
	MOVD	16+X, X5	\
	SBCS	Y3, T3		\
	SBCS	Y4, T4		\
	SBCS	Y5, T5		\
	SBC	ZR, T10		\
				\
	ADDS	Y3, T0		\	// ((xH + xL) * (yH + yL) - xL * yL) * 2^192 + xL * yL
	ADCS	Y4, T1		\
	LDP	0+Y, (Y3, Y4)	\
	MUL	X3, Y3, X0	\
	ADCS	Y5, T2		\
	UMULH	X3, Y3, Y0	\
	MOVD	16+Y, Y5	\
	MUL	X3, Y4, X1	\
	ADCS	ZR, T3		\
	UMULH	X3, Y4, X2	\
	ADCS	ZR, T4		\
	ADCS	ZR, T5		\
	ADC	ZR, T10		\
				\	// xH * yH, overwrite xLow, yLow
	mul192x192comba(X3, X4, X5, Y3, Y4, Y5, X0, X1, X2, Y0, Y1, Y2, T6, T7, T8, T9)\
				\
	SUBS	X0, T0		\	// ((xH + xL) * (yH + yL) - xL * yL - xH * yH)
	SBCS	X1, T1		\
	SBCS	X2, T2		\
	SBCS	Y0, T3		\
	SBCS	Y1, T4		\
	SBCS	Y2, T5		\
	SBC	ZR, T10		\
				\
	ADDS	X0, T3		\
	ADCS	X1, T4		\
	ADCS	X2, T5		\
	ADCS	T10, Y0, T6	\
	ADCS	ZR, Y1, T7	\
	ADC	ZR, Y2, T8


TEXT ·mulP751(SB), NOSPLIT, $0-24
	MOVD	z+0(FP), R2
	MOVD	x+8(FP), R0
	MOVD	y+16(FP), R1

	// Load xL in R3-R8, xH in R9-R14
	// (xH + xL) in R3-R8, destroys xH
	LDP	0(R0), (R3, R4)
	LDP	48(R0), (R9, R10)
	ADDS	R9, R3
	ADCS	R10, R4
	LDP	16(R0), (R5, R6)
	LDP	64(R0), (R11, R12)
	ADCS	R11, R5
	ADCS	R12, R6
	LDP	32(R0), (R7, R8)
	LDP	80(R0), (R13, R14)
	ADCS	R13, R7
	ADCS	R14, R8
	ADC	ZR, ZR, R22

	// Load yL in R9-R14, yH in R15-21
	// (yH + yL) in R9-R14, destroys yH
	LDP	0(R1), (R9, R10)
	LDP	48(R1), (R15, R16)
	ADDS	R15, R9
	ADCS	R16, R10
	LDP	16(R1), (R11, R12)
	LDP	64(R1), (R17, R19)
	ADCS	R17, R11
	ADCS	R19, R12
	LDP	32(R1), (R13, R14)
	LDP	80(R1), (R20, R21)
	ADCS	R20, R13
	ADCS	R21, R14
	ADC	ZR, ZR, R23

	// Compute masks and combined carry
	SUB	R22, ZR, R24
	SUB	R23, ZR, R25
	AND	R23, R22

	// Store xH, yH in z so mul384x384karatsuba can retrieve them from memory
	// It doesn't have enough registers
	// Meanwhile computed masked(xH + xL) in R15-R21
	STP	(R6, R7), 0(R2)
	AND	R25, R3, R15
	AND	R25, R4, R16
	STP	(R8, R12), 16(R2)
	AND	R25, R5, R17
	AND	R25, R6, R19
	STP	(R13, R14), 32(R2)
	AND	R25, R7, R20
	AND	R25, R8, R21

	// Masked(xH + xL) + masked(yH + yL) in R15-R21
	// Store intermediate values in z
	AND	R24, R9, R25
	AND	R24, R10, R26
	ADDS	R25, R15
	ADCS	R26, R16
	STP	(R15, R16), 96(R2)
	AND	R24, R11, R25
	AND	R24, R12, R26
	ADCS	R25, R17
	ADCS	R26, R19
	STP	(R17, R19), 112(R2)
	AND	R24, R13, R25
	AND	R24, R14, R26
	ADCS	R25, R20
	ADCS	R26, R21
	STP	(R20, R21), 128(R2)
	// Store carry in R29 so it can remain there
	ADC	ZR, R22, R29

	// (xH + xL) * (yH + yL)
	mul384x384karatsuba(0(R2), 24(R2), 48(R2), R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16, R17, R19, R20, R21, R22, R23, R24, R25, R26)

	// Load masked(xH + xL) + masked(yH + yL) and add that to its top half
	// Store the result back in z
	STP	(R15, R16), 72(R2)
	LDP	96(R2), (R3, R4)
	ADDS	R3, R19
	STP	(R17, R19), 88(R2)
	ADCS	R4, R20
	LDP	112(R2), (R5, R6)
	ADCS	R5, R21
	STP	(R20, R21), 104(R2)
	ADCS	R6, R22
	LDP	128(R2), (R7, R8)
	ADCS	R7, R23
	STP	(R22, R23), 120(R2)
	ADCS	R8, R24
	MOVD	R24, 136(R2)
	ADC	ZR, R29

	// Load xL, yL
	LDP	0(R0), (R3, R4)
	LDP	16(R0), (R5, R6)
	LDP	32(R0), (R7, R8)
	LDP	0(R1), (R9, R10)
	LDP	16(R1), (R11, R12)
	LDP	32(R1), (R13, R14)

	// xL * yL
	mul384x384karatsuba(24(R0), 24(R1), 0(R2), R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16, R17, R19, R20, R21, R22, R23, R24, R25, R26)

	// (xH + xL) * (yH + yL) - xL * yL in R3-R14
	LDP	0(R2), (R12, R13)
	LDP	48(R2), (R3, R4)
	SUBS	R12, R3
	LDP	64(R2), (R5, R6)
	MOVD	16(R2), R14
	SBCS	R13, R4
	SBCS	R14, R5
	LDP	80(R2), (R7, R8)
	SBCS	R15, R6
	SBCS	R16, R7
	LDP	96(R2), (R9, R10)
	SBCS	R17, R8
	SBCS	R19, R9
	LDP	112(R2), (R11, R12)
	SBCS	R20, R10
	SBCS	R21, R11
	LDP	128(R2), (R13, R14)
	SBCS	R22, R12
	SBCS	R23, R13
	SBCS	R24, R14
	SBC	ZR, R29

	STP	(R15, R16), 24(R2)
	MOVD	R17, 40(R2)

	// ((xH + xL) * (yH + yL) - xL * yL) * 2^384 + xL * yL and store back in z
	ADDS	R19, R3
	ADCS	R20, R4
	STP	(R3, R4), 48(R2)
	ADCS	R21, R5
	ADCS	R22, R6
	STP	(R5, R6), 64(R2)
	ADCS	R23, R7
	ADCS	R24, R8
	STP	(R7, R8), 80(R2)
	ADCS	ZR, R9
	ADCS	ZR, R10
	STP	(R9, R10), 96(R2)
	ADCS	ZR, R11
	ADCS	ZR, R12
	STP	(R11, R12), 112(R2)
	ADCS	ZR, R13
	ADCS	ZR, R14
	STP	(R13, R14), 128(R2)
	ADC	ZR, R29

	// Load xH, yH
	LDP	48(R0), (R3, R4)
	LDP	64(R0), (R5, R6)
	LDP	80(R0), (R7, R8)
	LDP	48(R1), (R9, R10)
	LDP	64(R1), (R11, R12)
	LDP	80(R1), (R13, R14)

	// xH * yH
	mul384x384karatsuba(72(R0), 72(R1), 144(R2), R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16, R17, R19, R20, R21, R22, R23, R24, R25, R26)

	LDP	144(R2), (R12, R13)
	MOVD	160(R2), R14

	// (xH + xL) * (yH + yL) - xL * yL - xH * yH in R3-R14
	// Store lower half in z, that's done
	LDP	48(R2), (R3, R4)
	SUBS	R12, R3
	LDP	64(R2), (R5, R6)
	SBCS	R13, R4
	SBCS	R14, R5
	LDP	80(R2), (R7, R8)
	SBCS	R15, R6
	SBCS	R16, R7
	LDP	96(R2), (R9, R10)
	SBCS	R17, R8
	SBCS	R19, R9
	LDP	112(R2), (R11, R12)
	SBCS	R20, R10
	SBCS	R21, R11
	LDP	128(R2), (R13, R14)
	SBCS	R22, R12
	SBCS	R23, R13
	STP	(R3, R4), 48(R2)
	SBCS	R24, R14
	STP	(R5, R6), 64(R2)
	SBC	ZR, R29
	STP	(R7, R8), 80(R2)

	// (xH * yH) * 2^768 + ((xH + xL) * (yH + yL) - xL * yL - xH * yH) * 2^384 + xL * yL
	// Store remaining limbs in z
	LDP	144(R2), (R3, R4)
	MOVD	160(R2), R5

	ADDS	R3, R9
	ADCS	R4, R10
	STP	(R9, R10), 96(R2)
	ADCS	R5, R11
	ADCS	R15, R12
	STP	(R11, R12), 112(R2)
	ADCS	R16, R13
	ADCS	R17, R14
	STP	(R13, R14), 128(R2)

	ADCS	R29, R19
	ADCS	ZR, R20
	STP	(R19, R20), 144(R2)
	ADCS	ZR, R21
	ADCS	ZR, R22
	STP	(R21, R22), 160(R2)
	ADCS	ZR, R23
	ADC	ZR, R24
	STP	(R23, R24), 176(R2)

	RET

TEXT ·rdcP751(SB), NOSPLIT, $0-16
	MOVD	z+0(FP), R0
	MOVD	x+8(FP), R1

	// Load p751+1 in R14-R17, R29, R19-R20, spread over arithmetic
	LDP	·P751p1+40(SB), (R14, R15)
	// z0-z11 will be R2-R13
	// Load x0-x4 to z0-z4 and x5, spread over arithmetic
	LDP	0(R1), (R2, R3)

	// x5 iteration
	MUL	R2, R14, R22
	LDP	32(R1), (R6, R21)
	UMULH	R2, R14, R23
	ADDS	R21, R22, R7	// Set z5
	ADC	ZR, R23, R25

	// x6 iteration
	MUL	R2, R15, R22
	MOVD	48(R1), R21
	UMULH	R2, R15, R23
	ADDS	R22, R25
	ADC	R23, ZR, R26

	MUL	R3, R14, R22
	LDP	·P751p1+56(SB), (R16, R17)
	UMULH	R3, R14, R23
	ADDS	R22, R25
	ADCS	R23, R26
	ADC	ZR, ZR, R24

	ADDS	R21, R25, R8	// Set z6
	ADCS	ZR, R26
	ADC	ZR, R24

	// x7 iteration
	MUL	R2, R16, R22
	MOVD	56(R1), R21
	UMULH	R2, R16, R23
	ADDS	R22, R26
	ADCS	R23, R24
	ADC	ZR, ZR, R25

	MUL	R3, R15, R22
	LDP	16(R1), (R4, R5)
	UMULH	R3, R15, R23
	ADDS	R22, R26
	ADCS	R23, R24
	ADC	ZR, R25

	MUL	R4, R14, R22
	LDP	·P751p1+72(SB), (R29, R19)
	UMULH	R4, R14, R23
	ADDS	R22, R26
	ADCS	R23, R24
	ADC	ZR, R25

	ADDS	R21, R26, R9	// Set z7
	ADCS	ZR, R24
	ADC	ZR, R25

	// x8 iteration
	MUL	R2, R17, R22
	MOVD	64(R1), R21
	UMULH	R2, R17, R23
	ADDS	R22, R24
	ADCS	R23, R25
	ADC	ZR, ZR, R26

	MUL	R3, R16, R22
	MOVD	·P751p1+88(SB), R20
	UMULH	R3, R16, R23
	ADDS	R22, R24
	ADCS	R23, R25
	ADC	ZR, R26

	MUL	R4, R15, R22
	UMULH	R4, R15, R23
	ADDS	R22, R24
	ADCS	R23, R25
	ADC	ZR, R26

	MUL	R5, R14, R22
	UMULH	R5, R14, R23
	ADDS	R22, R24
	ADCS	R23, R25
	ADC	ZR, R26

	ADDS	R24, R21, R10	// Set z8
	ADCS	ZR, R25
	ADC	ZR, R26

	// x9 iteration
	MUL	R2, R29, R22
	MOVD	72(R1), R21
	UMULH	R2, R29, R23
	ADDS	R22, R25
	ADCS	R23, R26
	ADC	ZR, ZR, R24

	MUL	R3, R17, R22
	UMULH	R3, R17, R23
	ADDS	R22, R25
	ADCS	R23, R26
	ADC	ZR, R24

	MUL	R4, R16, R22
	UMULH	R4, R16, R23
	ADDS	R22, R25
	ADCS	R23, R26
	ADC	ZR, R24

	MUL	R5, R15, R22
	UMULH	R5, R15, R23
	ADDS	R22, R25
	ADCS	R23, R26
	ADC	ZR, R24

	MUL	R6, R14, R22
	UMULH	R6, R14, R23
	ADDS	R22, R25
	ADCS	R23, R26
	ADC	ZR, R24

	ADDS	R21, R25, R11	// Set z9
	ADCS	ZR, R26
	ADC	ZR, R24

	// x10 iteration
	MUL	R2, R19, R22
	MOVD	80(R1), R21
	UMULH	R2, R19, R23
	ADDS	R22, R26
	ADCS	R23, R24
	ADC	ZR, ZR, R25

	MUL	R3, R29, R22
	UMULH	R3, R29, R23
	ADDS	R22, R26
	ADCS	R23, R24
	ADC	ZR, R25

	MUL	R4, R17, R22
	UMULH	R4, R17, R23
	ADDS	R22, R26
	ADCS	R23, R24
	ADC	ZR, R25

	MUL	R5, R16, R22
	UMULH	R5, R16, R23
	ADDS	R22, R26
	ADCS	R23, R24
	ADC	ZR, R25

	MUL	R6, R15, R22
	UMULH	R6, R15, R23
	ADDS	R22, R26
	ADCS	R23, R24
	ADC	ZR, R25

	MUL	R7, R14, R22
	UMULH	R7, R14, R23
	ADDS	R22, R26
	ADCS	R23, R24
	ADC	ZR, R25

	ADDS	R21, R26, R12	// Set z10
	ADCS	ZR, R24
	ADC	ZR, R25

	// x11 iteration
	MUL	R2, R20, R22
	MOVD	88(R1), R21
	UMULH	R2, R20, R23
	ADDS	R22, R24
	ADCS	R23, R25
	ADC	ZR, ZR, R26

	MUL	R3, R19, R22
	UMULH	R3, R19, R23
	ADDS	R22, R24
	ADCS	R23, R25
	ADC	ZR, R26

	MUL	R4, R29, R22
	UMULH	R4, R29, R23
	ADDS	R22, R24
	ADCS	R23, R25
	ADC	ZR, R26

	MUL	R5, R17, R22
	UMULH	R5, R17, R23
	ADDS	R22, R24
	ADCS	R23, R25
	ADC	ZR, R26

	MUL	R6, R16, R22
	UMULH	R6, R16, R23
	ADDS	R22, R24
	ADCS	R23, R25
	ADC	ZR, R26

	MUL	R7, R15, R22
	UMULH	R7, R15, R23
	ADDS	R22, R24
	ADCS	R23, R25
	ADC	ZR, R26

	MUL	R8, R14, R22
	UMULH	R8, R14, R23
	ADDS	R22, R24
	ADCS	R23, R25
	ADC	ZR, R26

	ADDS	R21, R24, R13	// Set z11
	ADCS	ZR, R25
	ADC	ZR, R26

	// x12 iteration
	MUL	R3, R20, R22
	MOVD	96(R1), R21
	UMULH	R3, R20, R23
	ADDS	R22, R25
	ADCS	R23, R26
	ADC	ZR, ZR, R24

	MUL	R4, R19, R22
	UMULH	R4, R19, R23
	ADDS	R22, R25
	ADCS	R23, R26
	ADC	ZR, R24

	MUL	R5, R29, R22
	UMULH	R5, R29, R23
	ADDS	R22, R25
	ADCS	R23, R26
	ADC	ZR, R24

	MUL	R6, R17, R22
	UMULH	R6, R17, R23
	ADDS	R22, R25
	ADCS	R23, R26
	ADC	ZR, R24

	MUL	R7, R16, R22
	UMULH	R7, R16, R23
	ADDS	R22, R25
	ADCS	R23, R26
	ADC	ZR, R24

	MUL	R8, R15, R22
	UMULH	R8, R15, R23
	ADDS	R22, R25
	ADCS	R23, R26
	ADC	ZR, R24

	MUL	R9, R14, R22
	UMULH	R9, R14, R23
	ADDS	R22, R25
	ADCS	R23, R26
	ADC	ZR, R24

	ADDS	R21, R25, R2	// Set z0
	ADCS	ZR, R26
	ADC	ZR, R24

	// x13 iteration
	MUL	R4, R20, R22
	MOVD	104(R1), R21
	UMULH	R4, R20, R23
	ADDS	R22, R26
	ADCS	R23, R24
	ADC	ZR, ZR, R25

	MUL	R5, R19, R22
	UMULH	R5, R19, R23
	ADDS	R22, R26
	ADCS	R23, R24
	ADC	ZR, R25

	MUL	R6, R29, R22
	UMULH	R6, R29, R23
	ADDS	R22, R26
	ADCS	R23, R24
	ADC	ZR, R25

	MUL	R7, R17, R22
	UMULH	R7, R17, R23
	ADDS	R22, R26
	ADCS	R23, R24
	ADC	ZR, R25

	MUL	R8, R16, R22
	UMULH	R8, R16, R23
	ADDS	R22, R26
	ADCS	R23, R24
	ADC	ZR, R25

	MUL	R9, R15, R22
	UMULH	R9, R15, R23
	ADDS	R22, R26
	ADCS	R23, R24
	ADC	ZR, R25

	MUL	R10, R14, R22
	UMULH	R10, R14, R23
	ADDS	R22, R26
	ADCS	R23, R24
	ADC	ZR, R25

	ADDS	R21, R26, R3	// Set z1
	STP	(R2, R3), 0(R0)
	ADCS	ZR, R24
	ADC	ZR, R25

	// x14 iteration
	MUL	R5, R20, R22
	MOVD	112(R1), R21
	UMULH	R5, R20, R23
	ADDS	R22, R24
	ADCS	R23, R25
	ADC	ZR, ZR, R26

	MUL	R6, R19, R22
	UMULH	R6, R19, R23
	ADDS	R22, R24
	ADCS	R23, R25
	ADC	ZR, R26

	MUL	R7, R29, R22
	UMULH	R7, R29, R23
	ADDS	R22, R24
	ADCS	R23, R25
	ADC	ZR, R26

	MUL	R8, R17, R22
	UMULH	R8, R17, R23
	ADDS	R22, R24
	ADCS	R23, R25
	ADC	ZR, R26

	MUL	R9, R16, R22
	UMULH	R9, R16, R23
	ADDS	R22, R24
	ADCS	R23, R25
	ADC	ZR, R26

	MUL	R10, R15, R22
	UMULH	R10, R15, R23
	ADDS	R22, R24
	ADCS	R23, R25
	ADC	ZR, R26

	MUL	R11, R14, R22
	UMULH	R11, R14, R23
	ADDS	R22, R24
	ADCS	R23, R25
	ADC	ZR, R26

	ADDS	R21, R24, R4	// Set z2
	ADCS	ZR, R25
	ADC	ZR, R26

	// x15 iteration
	MUL	R6, R20, R22
	MOVD	120(R1), R21
	UMULH	R6, R20, R23
	ADDS	R22, R25
	ADCS	R23, R26
	ADC	ZR, ZR, R24

	MUL	R7, R19, R22
	UMULH	R7, R19, R23
	ADDS	R22, R25
	ADCS	R23, R26
	ADC	ZR, R24

	MUL	R8, R29, R22
	UMULH	R8, R29, R23
	ADDS	R22, R25
	ADCS	R23, R26
	ADC	ZR, R24

	MUL	R9, R17, R22
	UMULH	R9, R17, R23
	ADDS	R22, R25
	ADCS	R23, R26
	ADC	ZR, R24

	MUL	R10, R16, R22
	UMULH	R10, R16, R23
	ADDS	R22, R25
	ADCS	R23, R26
	ADC	ZR, R24

	MUL	R11, R15, R22
	UMULH	R11, R15, R23
	ADDS	R22, R25
	ADCS	R23, R26
	ADC	ZR, R24

	MUL	R12, R14, R22
	UMULH	R12, R14, R23
	ADDS	R22, R25
	ADCS	R23, R26
	ADC	ZR, R24

	ADDS	R21, R25, R5	// Set z3
	STP	(R4, R5), 16(R0)
	ADCS	ZR, R26
	ADC	ZR, R24

	// x16 iteration
	MUL	R7, R20, R22
	MOVD	128(R1), R21
	UMULH	R7, R20, R23
	ADDS	R22, R26
	ADCS	R23, R24
	ADC	ZR, ZR, R25

	MUL	R8, R19, R22
	UMULH	R8, R19, R23
	ADDS	R22, R26
	ADCS	R23, R24
	ADC	ZR, R25

	MUL	R9, R29, R22
	UMULH	R9, R29, R23
	ADDS	R22, R26
	ADCS	R23, R24
	ADC	ZR, R25

	MUL	R10, R17, R22
	UMULH	R10, R17, R23
	ADDS	R22, R26
	ADCS	R23, R24
	ADC	ZR, R25

	MUL	R11, R16, R22
	UMULH	R11, R16, R23
	ADDS	R22, R26
	ADCS	R23, R24
	ADC	ZR, R25

	MUL	R12, R15, R22
	UMULH	R12, R15, R23
	ADDS	R22, R26
	ADCS	R23, R24
	ADC	ZR, R25

	MUL	R13, R14, R22
	UMULH	R13, R14, R23
	ADDS	R22, R26
	ADCS	R23, R24
	ADC	ZR, R25

	ADDS	R21, R26, R6	// Set z4
	ADCS	ZR, R24
	ADC	ZR, R25

	// x17 iteration
	MUL	R8, R20, R22
	MOVD	136(R1), R21
	UMULH	R8, R20, R23
	ADDS	R22, R24
	ADCS	R23, R25
	ADC	ZR, ZR, R26

	MUL	R9, R19, R22
	UMULH	R9, R19, R23
	ADDS	R22, R24
	ADCS	R23, R25
	ADC	ZR, R26

	MUL	R10, R29, R22
	UMULH	R10, R29, R23
	ADDS	R22, R24
	ADCS	R23, R25
	ADC	ZR, R26

	MUL	R11, R17, R22
	UMULH	R11, R17, R23
	ADDS	R22, R24
	ADCS	R23, R25
	ADC	ZR, R26

	MUL	R12, R16, R22
	UMULH	R12, R16, R23
	ADDS	R22, R24
	ADCS	R23, R25
	ADC	ZR, R26

	MUL	R13, R15, R22
	UMULH	R13, R15, R23
	ADDS	R22, R24
	ADCS	R23, R25
	ADC	ZR, R26

	ADDS	R21, R24, R7	// Set z5
	STP	(R6, R7), 32(R0)
	ADCS	ZR, R25
	ADC	ZR, R26

	// x18 iteration
	MUL	R9, R20, R22
	MOVD	144(R1), R21
	UMULH	R9, R20, R23
	ADDS	R22, R25
	ADCS	R23, R26
	ADC	ZR, ZR, R24

	MUL	R10, R19, R22
	UMULH	R10, R19, R23
	ADDS	R22, R25
	ADCS	R23, R26
	ADC	ZR, R24

	MUL	R11, R29, R22
	UMULH	R11, R29, R23
	ADDS	R22, R25
	ADCS	R23, R26
	ADC	ZR, R24

	MUL	R12, R17, R22
	UMULH	R12, R17, R23
	ADDS	R22, R25
	ADCS	R23, R26
	ADC	ZR, R24

	MUL	R13, R16, R22
	UMULH	R13, R16, R23
	ADDS	R22, R25
	ADCS	R23, R26
	ADC	ZR, R24

	ADDS	R21, R25, R8	// Set z6
	ADCS	ZR, R26
	ADC	ZR, R24

	// x19 iteration
	MUL	R10, R20, R22
	MOVD	152(R1), R21
	UMULH	R10, R20, R23
	ADDS	R22, R26
	ADCS	R23, R24
	ADC	ZR, ZR, R25

	MUL	R11, R19, R22
	UMULH	R11, R19, R23
	ADDS	R22, R26
	ADCS	R23, R24
	ADC	ZR, R25

	MUL	R12, R29, R22
	UMULH	R12, R29, R23
	ADDS	R22, R26
	ADCS	R23, R24
	ADC	ZR, R25

	MUL	R13, R17, R22
	UMULH	R13, R17, R23
	ADDS	R22, R26
	ADCS	R23, R24
	ADC	ZR, R25

	ADDS	R21, R26, R9	// Set z7
	STP	(R8, R9), 48(R0)
	ADCS	ZR, R24
	ADC	ZR, R25

	// x20 iteration
	MUL	R11, R20, R22
	MOVD	160(R1), R21
	UMULH	R11, R20, R23
	ADDS	R22, R24
	ADCS	R23, R25
	ADC	ZR, ZR, R26

	MUL	R12, R19, R22
	UMULH	R12, R19, R23
	ADDS	R22, R24
	ADCS	R23, R25
	ADC	ZR, R26

	MUL	R13, R29, R22
	UMULH	R13, R29, R23
	ADDS	R22, R24
	ADCS	R23, R25
	ADC	ZR, R26

	ADDS	R21, R24, R10	// Set z8
	ADCS	ZR, R25
	ADC	ZR, R26

	// x21 iteration
	MUL	R12, R20, R22
	MOVD	168(R1), R21
	UMULH	R12, R20, R23
	ADDS	R22, R25
	ADCS	R23, R26
	ADC	ZR, ZR, R24

	MUL	R13, R19, R22
	UMULH	R13, R19, R23
	ADDS	R22, R25
	ADCS	R23, R26
	ADC	ZR, R24

	ADDS	R21, R25, R11	// Set z9
	STP	(R10, R11), 64(R0)
	ADCS	ZR, R26
	ADC	ZR, R24

	// x22 iteration
	MUL	R13, R20, R22
	MOVD	176(R1), R21
	UMULH	R13, R20, R23
	ADDS	R22, R26
	ADC	R23, R24
	ADDS	R21, R26, R12	// Set z10

	MOVD	184(R1), R21
	ADC	R21, R24, R13	// Set z11
	STP	(R12, R13), 80(R0)

	RET

TEXT ·modP751(SB), NOSPLIT, $0-8
	MOVD	x+0(FP), R0

	// Keep x in R1-R12, p751 in R13-R21, subtract to R1-R12
	MOVD	·P751+0(SB), R13
	LDP	0(R0), (R1, R2)
	LDP	16(R0), (R3, R4)
	SUBS	R13, R1
	SBCS	R13, R2

	LDP	32(R0), (R5, R6)
	LDP	·P751+40(SB), (R14, R15)
	SBCS	R13, R3
	SBCS	R13, R4

	LDP	48(R0), (R7, R8)
	LDP	·P751+56(SB), (R16, R17)
	SBCS	R13, R5
	SBCS	R14, R6

	LDP	64(R0), (R9, R10)
	LDP	·P751+72(SB), (R19, R20)
	SBCS	R15, R7
	SBCS	R16, R8

	LDP	80(R0), (R11, R12)
	MOVD	·P751+88(SB), R21
	SBCS	R17, R9
	SBCS	R19, R10

	SBCS	R20, R11
	SBCS	R21, R12
	SBC	ZR, ZR, R22

	// Mask with the borrow and add p751
	AND	R22, R13
	AND	R22, R14
	AND	R22, R15
	AND	R22, R16
	AND	R22, R17
	AND	R22, R19
	AND	R22, R20
	AND	R22, R21

	ADDS	R13, R1
	ADCS	R13, R2
	STP 	(R1, R2), 0(R0)
	ADCS	R13, R3
	ADCS	R13, R4
	STP 	(R3, R4), 16(R0)
	ADCS	R13, R5
	ADCS	R14, R6
	STP 	(R5, R6), 32(R0)
	ADCS	R15, R7
	ADCS	R16, R8
	STP 	(R7, R8), 48(R0)
	ADCS	R17, R9
	ADCS	R19, R10
	STP 	(R9, R10), 64(R0)
	ADCS	R20, R11
	ADC	R21, R12
	STP 	(R11, R12), 80(R0)

	RET
